%%capture
# --- Environment setup (notebook cell): install NLP dependencies quietly ---
!pip install spacy
!pip install gensim
!pip install pyLDAvis
!python -m spacy download en_core_web_sm
import re
import spacy
import pickle
import gensim
import logging
import warnings
import numpy as np
import pandas as pd
import gensim.corpora as corpora
from gensim.models import LdaSeqModel
from gensim.corpora import Dictionary
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
from pprint import pprint
from nltk.corpus import stopwords
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from nltk.corpus import PlaintextCorpusReader
from gensim.parsing.preprocessing import preprocess_string
%matplotlib inline
# Render pyLDAvis visualizations inline in the notebook.
pyLDAvis.enable_notebook()
# English stopword list plus a few extra noise tokens to filter out.
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
# spaCy pipeline used only for lemmatization/POS; parser and NER disabled for speed.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
# Remove Jupyter checkpoint dirs so PlaintextCorpusReader doesn't pick up stale copies.
!rm -rf `find -type d -name .ipynb_checkpoints`
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize each document with simple_preprocess and drop stopwords."""
    cleaned = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        cleaned.append([tok for tok in tokens if tok not in stop_words])
    return cleaned
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize each tokenized document, keeping only tokens whose POS tag
    is in ``allowed_postags``. Returns one list of lemmas per input document."""
    lemmatized = []
    for tokens in texts:
        # Re-join tokens so spaCy can tag the sentence as a whole.
        parsed = nlp(" ".join(tokens))
        kept = [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]
        lemmatized.append(kept)
    return lemmatized
# Sentence to Words
def sent_to_words(sentences):
    """Lazily yield each sentence as a list of lowercase tokens."""
    for sentence in sentences:
        # deacc=True also strips accents/punctuation during tokenization.
        tokens = gensim.utils.simple_preprocess(str(sentence), deacc=True)
        yield tokens
# Decade-partitioned corpus directories, in chronological order.
corpus_topics = ['TRANSFORMED/1950-1959', 'TRANSFORMED/1960-1969', 'TRANSFORMED/1970-1979', 'TRANSFORMED/1980-1989',
                 'TRANSFORMED/1990-1999', 'TRANSFORMED/2000-2009', 'TRANSFORMED/2010-2019', 'TRANSFORMED/2020-2029']
flag = True  # True until the first decade has seeded `texts`
for corpus_root in corpus_topics:
    rows = []
    corpus = PlaintextCorpusReader(corpus_root, '.*txt')
    text_list = corpus.fileids()
    for text in text_list:
        # Early decades encode the document id before the first "_";
        # later decades encode it after the last "_" (minus the ".txt" suffix).
        if corpus_root in ['TRANSFORMED/1950-1959', 'TRANSFORMED/1960-1969', 'TRANSFORMED/1970-1979']:
            identifier = text.split("_")[0]
        else:
            identifier = text.split("_")[-1][:-4]
        rows.append([identifier, " ".join(corpus.words(text))])
    # CREATE DATAFRAME
    df = pd.DataFrame.from_records(rows, columns=['id', 'content'])
    # CLEANING
    # Convert to list
    data = df['content'].values.tolist()
    # Remove emails (raw strings fix the invalid-escape-sequence warnings)
    data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
    # Collapse newlines / runs of whitespace
    data = [re.sub(r'\s+', ' ', sent) for sent in data]
    # Remove distracting single quotes
    data = [re.sub(r"\'", "", sent) for sent in data]
    # Sentence to words
    data_words = list(sent_to_words(data))
    # BUILD THE BIGRAM MODEL (higher threshold -> fewer phrases)
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    # Faster way to get a sentence clubbed as a bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    # NOTE(review): bigram_mod is built but never applied to the tokens below.
    # If bigrams are wanted, run e.g. [bigram_mod[doc] for doc in data_words_nostops].
    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)
    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    # Accumulate lemmatized docs across decades; chronological order matters
    # because LdaSeqModel's time_slice counts documents per period in order.
    if not flag:
        texts.extend(data_lemmatized)
    else:
        texts = data_lemmatized
        flag = False
# Dictionary and bag-of-words corpus over ALL decades (note: `corpus` is
# rebound here from the PlaintextCorpusReader used inside the loop).
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
print("*"*20+"FINISHED"+"*"*20)
# with open('news_texts.pkl', 'wb') as f:
#     pickle.dump(texts, f)
# with open('news_corpus.pkl', 'wb') as f:
#     pickle.dump(corpus, f)
# with open('news_dictionary.pkl', 'wb') as f:
#     pickle.dump(dictionary, f)
def _load_pickle(path):
    # Small helper: unpickle a single artifact from disk.
    with open(path, 'rb') as fh:
        return pickle.load(fh)

# Load the pre-built bag-of-words corpus and dictionary, then fit the
# dynamic (sequential) topic model.
print("Started loading..")
corpus = _load_pickle('news_corpus.pkl')
dictionary = _load_pickle('news_dictionary.pkl')
print("Finished loading..")
print("Started training..")
# Presumably the number of documents in each consecutive decade — verify
# against the corpus-building step above.
time_slice = [1051, 2900, 1258, 1771, 1520, 2010, 2852, 412]
lda_seq = LdaSeqModel(corpus=corpus, time_slice=time_slice, id2word=dictionary, num_topics=20, chunksize=200, passes=1)
print("Finished training..")
# print("Saving model..")
# with open('lda_seq_model.pkl', 'wb') as f:
#     pickle.dump(lda_seq, f)
# print("Saved model!")
# Restore the trained dynamic topic model and its supporting artifacts.
# NOTE(review): pickle.load is unsafe on untrusted files — acceptable here
# only because these artifacts were produced locally by this notebook.
with open('lda_seq_model.pkl', 'rb') as model_fh:
    ldaseq = pickle.load(model_fh)
print("Started loading..")
with open('news_corpus.pkl', 'rb') as corpus_fh:
    corpus = pickle.load(corpus_fh)
with open('news_dictionary.pkl', 'rb') as dict_fh:
    dictionary = pickle.load(dict_fh)
with open('news_texts.pkl', 'rb') as texts_fh:
    texts = pickle.load(texts_fh)
print("Finished loading..")
To print all topics from a particular time period, simply use `print_topics`. Its input parameter is the time slice to inspect; passing 0 shows the topics in the first time slice. The result is a list of lists, where each inner list contains tuples of the most probable words in a topic, i.e. `(word, word_probability)`.
# 20 topics per time slice; each topic is a list of (word, probability) pairs.
ldaseq.print_topics(time=0)  # topics in the first time slice
# Evolution of the first topic across all time slices.
# (The original ran this identical call twice; the duplicate was removed.)
ldaseq.print_topic_times(topic=0)
# pyLDAvis view for every time slice — replaces eight copy-pasted blocks
# (time=0..7) with one loop.
for time_idx in range(8):
    doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(time=time_idx, corpus=corpus)
    vis_wrapper = pyLDAvis.prepare(topic_term_dists=topic_term, doc_topic_dists=doc_topic,
                                   doc_lengths=doc_lengths, vocab=vocab, term_frequency=term_frequency)
    # NOTE(review): each original copy lived in its own notebook cell, where the
    # returned HTML rendered implicitly. Inside a loop it does not; wrap this in
    # IPython.display.display(...) to render every slice, not just the last.
    pyLDAvis.display(vis_wrapper)
# Report u_mass and c_v topic coherence for each of the eight time slices.
for time in range(8):
    topics_dtm = ldaseq.dtm_coherence(time=time)
    umass_cm = CoherenceModel(topics=topics_dtm, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    print("U_mass topic coherence for time slice {} is {}".format(time, umass_cm.get_coherence()))
    cv_cm = CoherenceModel(topics=topics_dtm, texts=texts, dictionary=dictionary, coherence='c_v')
    print("C_v topic coherence for time slice {} is {}\n".format(time, cv_cm.get_coherence()))